Load packages¶
# Manipulations
import pandas as pd
import numpy as np
# Plotting
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
# Other
import warnings
Set theme¶
# Use the clean white Plotly theme and render figures inline in the notebook.
pio.templates.default = "plotly_white"
pio.renderers.default = "notebook"
# Silence all warnings for cleaner notebook output.
# NOTE(review): this also hides pandas/plotly deprecation warnings —
# consider narrowing the filter instead of a blanket "ignore".
warnings.filterwarnings("ignore")
EDA¶
# Load the retail transactions; the file contains non-ASCII bytes (e.g. "£"),
# so ISO-8859-1 (Latin-1) is used instead of the default UTF-8.
df = pd.read_csv("data/data.csv", encoding = 'ISO-8859-1') # it contains non-ASCII
# Quick structural overview: column dtypes and the first five rows.
print(df.dtypes)
print(df.head())
InvoiceNo object
StockCode object
Description object
Quantity int64
InvoiceDate object
UnitPrice float64
CustomerID float64
Country object
dtype: object
InvoiceNo StockCode Description Quantity \
0 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6
1 536365 71053 WHITE METAL LANTERN 6
2 536365 84406B CREAM CUPID HEARTS COAT HANGER 8
3 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6
4 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6
InvoiceDate UnitPrice CustomerID Country
0 12/1/2010 8:26 2.55 17850.0 United Kingdom
1 12/1/2010 8:26 3.39 17850.0 United Kingdom
2 12/1/2010 8:26 2.75 17850.0 United Kingdom
3 12/1/2010 8:26 3.39 17850.0 United Kingdom
4 12/1/2010 8:26 3.39 17850.0 United Kingdom
# Summary statistics for the numeric columns (Quantity, UnitPrice, CustomerID).
df.describe()
| Quantity | UnitPrice | CustomerID | |
|---|---|---|---|
| count | 541909.000000 | 541909.000000 | 406829.000000 |
| mean | 9.552250 | 4.611114 | 15287.690570 |
| std | 218.081158 | 96.759853 | 1713.600303 |
| min | -80995.000000 | -11062.060000 | 12346.000000 |
| 25% | 1.000000 | 1.250000 | 13953.000000 |
| 50% | 3.000000 | 2.080000 | 15152.000000 |
| 75% | 10.000000 | 4.130000 | 16791.000000 |
| max | 80995.000000 | 38970.000000 | 18287.000000 |
A negative quantity in this data set represents a "return transaction" instead of a "purchase transaction".
Businesses use quantity sold × unit price to determine sales revenue.
# Inspect rows with a zero unit price — possible free items or data-quality issues.
df[df['UnitPrice'] == 0]
| InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
|---|---|---|---|---|---|---|---|---|
| 622 | 536414 | 22139 | NaN | 56 | 12/1/2010 11:52 | 0.0 | NaN | United Kingdom |
| 1970 | 536545 | 21134 | NaN | 1 | 12/1/2010 14:32 | 0.0 | NaN | United Kingdom |
| 1971 | 536546 | 22145 | NaN | 1 | 12/1/2010 14:33 | 0.0 | NaN | United Kingdom |
| 1972 | 536547 | 37509 | NaN | 1 | 12/1/2010 14:33 | 0.0 | NaN | United Kingdom |
| 1987 | 536549 | 85226A | NaN | 1 | 12/1/2010 14:34 | 0.0 | NaN | United Kingdom |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 536981 | 581234 | 72817 | NaN | 27 | 12/8/2011 10:33 | 0.0 | NaN | United Kingdom |
| 538504 | 581406 | 46000M | POLYESTER FILLER PAD 45x45cm | 240 | 12/8/2011 13:58 | 0.0 | NaN | United Kingdom |
| 538505 | 581406 | 46000S | POLYESTER FILLER PAD 40x40cm | 300 | 12/8/2011 13:58 | 0.0 | NaN | United Kingdom |
| 538554 | 581408 | 85175 | NaN | 20 | 12/8/2011 14:06 | 0.0 | NaN | United Kingdom |
| 538919 | 581422 | 23169 | smashed | -235 | 12/8/2011 15:24 | 0.0 | NaN | United Kingdom |
2515 rows × 8 columns
Weird, there are invoices with 0 unit price? It could be they are free items or complimentary goods for customers. Will keep them.
# Missing values per column: absolute counts, then as a percentage of all rows.
# (isna/isnull are aliases; compute the counts once and reuse them.)
na_counts = df.isna().sum()
print(na_counts)
print(na_counts / len(df) * 100)
InvoiceNo 0 StockCode 0 Description 1454 Quantity 0 InvoiceDate 0 UnitPrice 0 CustomerID 135080 Country 0 dtype: int64 InvoiceNo 0.000000 StockCode 0.000000 Description 0.268311 Quantity 0.000000 InvoiceDate 0.000000 UnitPrice 0.000000 CustomerID 24.926694 Country 0.000000 dtype: float64 InvoiceNo 0.000000 StockCode 0.000000 Description 0.268311 Quantity 0.000000 InvoiceDate 0.000000 UnitPrice 0.000000 CustomerID 24.926694 Country 0.000000 dtype: float64
About 25% of the entries have missing customer IDs.
# Daily count of rows lacking a CustomerID, plotted as a time series.
no_id_daily = (
    df.loc[df['CustomerID'].isna()]
      .groupby('InvoiceDate')
      .size()
      .reset_index(name = 'Count')
)
fig = px.line(no_id_daily, x = 'InvoiceDate', y = 'Count',
              title = 'There are missing Customer IDs throughout the whole time period.')
fig.show()
# Price/quantity distribution restricted to rows with a missing CustomerID.
df[df['CustomerID'].isnull()][['UnitPrice', 'Quantity']].describe()
| UnitPrice | Quantity | |
|---|---|---|
| count | 135080.000000 | 135080.000000 |
| mean | 8.076577 | 1.995573 |
| std | 151.900816 | 66.696153 |
| min | -11062.060000 | -9600.000000 |
| 25% | 1.630000 | 1.000000 |
| 50% | 3.290000 | 1.000000 |
| 75% | 5.450000 | 3.000000 |
| max | 17836.460000 | 5568.000000 |
- Entries missing a customer ID exhibit outliers in both price and quantity.
- Not sure what the reason behind the missing IDs could be, but this is bad practice and the retailer should consider adding data-quality checks.
Manipulations¶
Extracting dates ...
# Parse the invoice timestamp with an explicit month-first format
# (values look like "12/1/2010 8:26") instead of letting pandas infer it:
# an explicit format is unambiguous (no day-first surprises) and much
# faster; then keep only the calendar date for the daily aggregations below.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], format = '%m/%d/%Y %H:%M').dt.date
... extracting cancelled invoices.
According to the Variable Description, if the InvoiceNo starts with "C" then it means it's cancelled.
# Flag cancelled invoices: per the variable description, an InvoiceNo that
# starts with "C" marks a cancellation.
# Use the vectorised string accessor instead of np.where + a Python-level
# lambda: np.where(cond, True, False) is redundant (cond already is the
# boolean answer), and x[0] would raise on any non-string invoice number
# in this object-dtype column — astype(str) guards against that.
df['IsCancelled'] = df['InvoiceNo'].astype(str).str.startswith('C')
... adding revenue.
# Revenue per line item: quantity sold times unit price
# (negative for returns/cancellations).
df['SalesRevenue'] = df['Quantity'].mul(df['UnitPrice'])
... and adding other variables that might be interesting for later analysis.
# Split the unit price into its fractional ("pennies") and whole-pound parts,
# then bucket the pound component into coarse ranges for the later plots.
df['Pennies'] = df['UnitPrice'] % 1
df['Pounds'] = df['UnitPrice'].astype(int)
# Bin edges for the whole-pound ranges (module-level: reused further down).
bins = [-100000, 0, 1, 5, 10, 20, 50, 100, 1000, 100000]
pound_bins = pd.cut(df['Pounds'], bins = bins, include_lowest = True)
# Represent each range by its (inclusive) right edge as a plain integer.
df['PoundsRange'] = pound_bins.map(lambda interval: interval.right).astype(int)
Let's go!
Since I had just added the cancelled variable, let's see if that is important to take into consideration.
# Share of cancelled vs. non-cancelled rows, as percentages of all rows.
df['IsCancelled'].value_counts() / len(df) * 100
False 98.286059 True 1.713941 Name: IsCancelled, dtype: float64
A very small share is cancelled. Is this worrying anyways?
# Summary statistics restricted to cancelled invoices only.
df[df['IsCancelled']].describe()
| Quantity | UnitPrice | CustomerID | SalesRevenue | Pennies | Pounds | PoundsRange | |
|---|---|---|---|---|---|---|---|
| count | 9288.000000 | 9288.000000 | 8905.000000 | 9288.000000 | 9288.000000 | 9288.000000 | 9288.000000 |
| mean | -29.885228 | 48.393661 | 14991.667266 | -96.556039 | 0.612115 | 47.781546 | 733.569121 |
| std | 1145.786965 | 666.600430 | 1706.772357 | 2039.905511 | 0.302846 | 666.613332 | 8399.176224 |
| min | -80995.000000 | 0.010000 | 12346.000000 | -168469.600000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | -6.000000 | 1.450000 | 13510.000000 | -17.700000 | 0.290000 | 1.000000 | 1.000000 |
| 50% | -2.000000 | 2.950000 | 14895.000000 | -8.500000 | 0.650000 | 2.000000 | 5.000000 |
| 75% | -1.000000 | 5.950000 | 16393.000000 | -3.700000 | 0.950000 | 5.000000 | 5.000000 |
| max | -1.000000 | 38970.000000 | 18282.000000 | -0.120000 | 0.990000 | 38970.000000 | 100000.000000 |
- There is a wide range across quantities and prices.
- All cancelled orders have negative quantities but positive unit prices.
Let's see the progress in time.
# Total quantity sold per invoice date, shown as a time series.
daily_quantity = df.groupby('InvoiceDate')['Quantity'].sum().reset_index(name = 'Quantity')
fig = px.line(daily_quantity, x = 'InvoiceDate', y = 'Quantity',
              title = 'In terms of Quantity sold, there was a period of growth from September on.')
fig.show()
OK, how's the revenue then?
# Total sales revenue per invoice date, shown as a time series.
daily_revenue = df.groupby('InvoiceDate')['SalesRevenue'].sum().reset_index(name = 'SalesRevenue')
fig = px.line(daily_revenue, x = 'InvoiceDate', y = 'SalesRevenue',
              title = 'The daily Sales Revenue reached 100k in September and November.')
fig.show()
And how are our items priced?
# Unit-price distribution (non-negative prices only); the count axis is on a
# log scale because the distribution has a very long right tail.
nonnegative_prices = df.loc[df['UnitPrice'] >= 0, ['UnitPrice']]
fig = px.histogram(nonnegative_prices, x = 'UnitPrice',
                   nbins = 100, log_y = True,
                   title = 'Most items are in the lower price range.')
fig.show()
# Zoom into the £0–£100 range (inclusive on both ends, as before).
affordable_prices = df.loc[df['UnitPrice'].between(0, 100), ['UnitPrice']]
fig = px.histogram(affordable_prices, x = 'UnitPrice',
                   nbins = 100,
                   title = 'Looking closer, up to £100, a high range of items are between £1 - £2.99.')
fig.show()
So, what are we actually selling?
# Number of distinct products (stock codes) in the data.
df['StockCode'].nunique()
4070
Over 4,000 different product items.
# A "regular" stock code starts with five digits (a trailing letter suffix
# such as "85123A" still matches); anything else is a special code such as
# POST, D (discount), M (manual), BANK CHARGES, etc.
# str.match already anchors at the start of the string, so the '^' in the
# original pattern was redundant, and ~mask is the idiomatic negation
# instead of comparing a boolean Series with == False.
df['IsValidStockCode'] = df['StockCode'].astype(str).str.match(r'\d{5}')
irregular = ~df['IsValidStockCode']
print(df.loc[irregular, ['StockCode', 'Description']])
print(df.loc[irregular, 'Description'].unique())
StockCode Description 45 POST POSTAGE 141 D Discount 386 POST POSTAGE 1123 POST POSTAGE 1423 C2 CARRIAGE ... ... ... 541540 DOT DOTCOM POSTAGE 541541 M Manual 541730 POST POSTAGE 541767 POST POSTAGE 541768 POST POSTAGE [2995 rows x 2 columns] ['POSTAGE' 'Discount' 'CARRIAGE' 'DOTCOM POSTAGE' 'Manual' 'Bank Charges' 'SAMPLES' 'AMAZON FEE' 'SUNJAR LED NIGHT NIGHT LIGHT' 'BOXED GLASS ASHTRAY' 'Dotcomgiftshop Gift Voucher £40.00' 'CAMOUFLAGE DOG COLLAR' 'Dotcomgiftshop Gift Voucher £50.00' 'Dotcomgiftshop Gift Voucher £30.00' 'Dotcomgiftshop Gift Voucher £20.00' nan 'OOH LA LA DOGS COLLAR' 'BOYS PARTY BAG' 'GIRLS PARTY BAG' 'Dotcomgiftshop Gift Voucher £10.00' 'PADS TO MATCH ALL CUSHIONS' 'HAYNES CAMPER SHOULDER BAG' 'to push order througha s stock was ' 'ebay' 'Adjust bad debt' 'CRUK Commission']
However, ...
- We've got 33 stock codes which do not follow the conditions of having 5 digits.
- Some make sense, such as "discounts" or "bank charges", but others, like "to push order through as stock was", are not even correctly written.
Also, the first stock code represents popcorn holders.
# Total revenue per stock code; plot the 100 highest earners.
revenue_by_code = (
    df.groupby('StockCode')['SalesRevenue']
      .sum()
      .reset_index(name = 'SalesRevenue')
      .sort_values(by = 'SalesRevenue', ascending = False)
)
fig = px.bar(revenue_by_code.head(100), x = 'StockCode', y = 'SalesRevenue',
             title = 'There are two stock codes being far ahead in terms of Sales Revenue.'
             )
fig.show()
Let's check what they are.
# What do the two top-revenue stock codes actually describe?
df.loc[df['StockCode'].isin(['DOT', '47566']), 'Description'].unique()
array(['DOTCOM POSTAGE', 'PARTY BUNTING', nan], dtype=object)
Postage charges and party bunting? Interesting combination.
# NOTE(review): neither figure is assigned or shown with .show(); if both
# calls live in one notebook cell only the last expression auto-renders —
# confirm the first histogram was meant to display too.
# Unit prices charged under the "DOT" (DOTCOM POSTAGE) stock code.
px.histogram(df[(df['StockCode'] == 'DOT')], x = 'UnitPrice',
title = 'The postage is rather expensive.')
# The same distribution, split by country.
px.histogram(df[(df['StockCode'] == 'DOT')], x = 'UnitPrice',
color = 'Country',
title = 'And it has been charged only for UK customers.')
# Total revenue per product description; top 100, with smaller tick labels
# so the long description strings stay legible.
revenue_by_description = (
    df.groupby('Description')['SalesRevenue']
      .sum()
      .reset_index(name = 'SalesRevenue')
      .sort_values(by = 'SalesRevenue', ascending = False)
)
fig = px.bar(revenue_by_description.head(100), x = 'Description', y = 'SalesRevenue',
             title = 'Here things looks different. We have got cakestands as number second!'
             )
fig.update_layout(xaxis = dict(tickfont = dict(size = 8)))
fig.show()
That seems to point to the fact that the stock code and the description do not have a 1-1 relationship.
# Count distinct descriptions per stock code; values above 1 show the
# code-to-description relationship is not one-to-one.
df.groupby('StockCode')['Description'].nunique().sort_values(ascending = False)
StockCode
20713 8
23084 7
21830 6
85175 6
23343 5
..
35969 0
20864 0
84664 0
20863 0
84964B 0
Name: Description, Length: 4070, dtype: int64
# Inspect every description recorded under stock code 20713.
df[df['StockCode'] == '20713']['Description'].unique()
array(['JUMBO BAG OWLS', nan, 'wrongly marked. 23343 in box',
'wrongly coded-23343', 'found', 'Found', 'wrongly marked 23343',
'Marked as 23343', 'wrongly coded 23343'], dtype=object)
This does not look good. That means we cannot rely on the description.
Now, let's check at last by country.
# Total revenue per country (this name is reused later to pick the top 10).
revenue_by_country = (
    df.groupby('Country')['SalesRevenue']
      .sum()
      .reset_index(name = 'SalesRevenue')
      .sort_values(by = 'SalesRevenue', ascending = False)
)
fig = px.bar(revenue_by_country, x = 'Country', y = 'SalesRevenue',
             title = 'The biggest market is UK by far, followed by other Western European countries and Australia?!')
fig.show()
# Unique customers per country; the UK dominates, so exclude it from the
# chart to make the remaining countries visible.
customers_by_country = (
    df.groupby('Country')['CustomerID']
      .nunique()
      .reset_index(name = 'NumberofCustomers')
      .sort_values(by = 'NumberofCustomers', ascending = False)
)
non_uk = customers_by_country['Country'] != 'United Kingdom'
fig = px.bar(customers_by_country[non_uk], x = 'Country', y = 'NumberofCustomers',
             title = 'After UK, Germany and France have over 80 unique customers each.')
fig.show()
Bear in mind, we still have 25% missing customer IDs, so the number could be different.
How did that change in time?
# Daily revenue per country, limited to the 10 highest-revenue countries,
# one facet row per country.
quantity_by_country_date = df.groupby(['Country', 'InvoiceDate'])['SalesRevenue'].sum().reset_index(name = 'SalesRevenue').sort_values(by = 'SalesRevenue', ascending = False)
# top_countries is reused by the following charts as well.
top_countries = revenue_by_country.nlargest(10, 'SalesRevenue')['Country']
fig = px.bar(quantity_by_country_date[quantity_by_country_date['Country'].isin(top_countries)],
             x = 'InvoiceDate', y = 'SalesRevenue', facet_row = 'Country',
             color = 'Country', facet_row_spacing = 0.01, height = 800,
             # typo fixed in the displayed title: "freququently" -> "frequently"
             title = 'From the top 10 countries, UK, Ireland, Germany and France buy more frequently.')
# Show only the country name in facet annotations ("Country=UK" -> "UK").
fig.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
# Independent y scales per facet (revenue magnitudes differ a lot), and blank
# all per-facet axis titles in one idiomatic call each — no need to walk
# fig.layout comparing types with == (isinstance would be required anyway).
fig.update_yaxes(matches = None, title_text = '')
fig.update_xaxes(title_text = '')
fig.show()
Does UK spend more per unit?
# Unit-price distribution per top country (log counts), prices clipped to [0, 100].
fig = px.histogram(df[df['Country'].isin(top_countries) & (df['UnitPrice'] >= 0) & (df['UnitPrice'] <= 100)],
                   x = 'UnitPrice', facet_row = 'Country', log_y = True, nbins = 100,
                   color = 'Country', facet_row_spacing = 0.02, height = 800,
                   title = 'All the top 9 countries except UK go for a lower unit price range.')
# Show only the country name in facet annotations.
fig.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
# Independent y scales per facet; clear every per-facet axis title with a
# single call instead of type-comparing each fig.layout entry with ==.
fig.update_yaxes(matches = None, title_text = '')
fig.update_xaxes(title_text = '')
fig.show()
- Still, there are fewer customers from non-UK markets, so the price range they cover is naturally less likely to be wide.
- Nevertheless, even if France is close in terms of customer quantity to UK, they do not go for higher priced units.
# Total quantity per (stock code, country), sorted by quantity descending.
quantity_by_code_country = df.groupby(['StockCode', 'Country'])['Quantity'].sum().reset_index(name = 'Quantity').sort_values(by = 'Quantity', ascending = False)
# Top 20 stock codes within each country.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0 —
# collect the per-country frames and concatenate once with pd.concat.
top_frames = [
    quantity_by_code_country[quantity_by_code_country['Country'] == country].head(20)
    for country in df['Country'].unique()
]
top_stock_codes_by_country = pd.concat(top_frames)
fig = px.bar(top_stock_codes_by_country[top_stock_codes_by_country['Country'].isin(top_countries)],
             x = 'StockCode', y = 'Quantity', facet_row ='Country', facet_row_spacing = 0.02,
             color = 'Country', height = 700,
             title = 'Top 20 Stock Codes of each Top 10 Country by Quantity: Each seems to be interested in different things.')
# Show only the country name in facet annotations.
fig.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
# Independent y scales; clear all per-facet axis titles in one call each.
fig.update_yaxes(matches = None, title_text = '')
fig.update_xaxes(title_text = '')
fig.show()
# Same view as above, but grouped by Description instead of StockCode.
quantity_by_code_country = df.groupby(['Description', 'Country'])['Quantity'].sum().reset_index(name = 'Quantity').sort_values(by = 'Quantity', ascending = False)
# Top 20 descriptions within each country.
# DataFrame.append was removed in pandas 2.0 — build the per-country frames
# in a list and concatenate once with pd.concat.
top_frames = [
    quantity_by_code_country[quantity_by_code_country['Country'] == country].head(20)
    for country in df['Country'].unique()
]
top_stock_codes_by_country = pd.concat(top_frames)
fig = px.bar(top_stock_codes_by_country[top_stock_codes_by_country['Country'].isin(top_countries)],
             x='Description', y = 'Quantity', facet_row ='Country', facet_row_spacing = 0.02,
             color = 'Country', height = 700,
             # typo fixed in the displayed title: "baloons" -> "balloons"
             title = 'But if you look closer, the items have different descriptions but come from adjacent groups. <br><sup>We have got cake stands, lunch boxes, cutlery, balloons, gift cards, and many party items. </sup>')
# Angle and shrink the x tick labels so long descriptions stay readable.
fig.update_xaxes(tickangle=45, tickfont=dict(size=8))
# Show only the country name in facet annotations.
fig.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
# Independent y scales; clear all per-facet axis titles in one call each.
fig.update_yaxes(matches = None, title_text = '')
fig.update_xaxes(title_text = '')
fig.show()
- Most top 20 items across countries are small-sized, lower-priced items.
- There is no standardized description.
Finally, I'm going to look if items in certain prices are bought more frequently.
# Distribution of the fractional (pence) component of the unit price.
pennies_fig = px.histogram(df, x = 'Pennies', title = '0.9 pennies might look more attractive to the customers or it is more encountered?')
pennies_fig.show()
# Pence distribution faceted by the pound price range; `bins` (defined
# earlier) supplies the facet ordering.
fig = px.histogram(df, x = 'Pennies', facet_row = 'PoundsRange',
                   color = 'PoundsRange', height = 800,
                   title = 'For items with a unit price between £1 and £50, items ending in 0.9 are bought in higher quantities.',
                   category_orders = {"PoundsRange": bins})
# Show only the range value in facet annotations.
fig.for_each_annotation(lambda a: a.update(text = a.text.split("=")[-1]))
# Independent y scales per facet; blank all per-facet axis titles with one
# idiomatic call each instead of walking fig.layout and comparing types
# with == (a known anti-pattern; isinstance would be needed at minimum).
fig.update_yaxes(matches = None, title_text = '')
fig.update_xaxes(title_text = '')
fig.show()
This is of course, not certain, since we would need to know what was in stock and compare then the numbers. It might be that it just happens that most items are priced accordingly.